Pokemon Clustering Analysis

This report presents the results of spectral clustering analysis on Pokemon data, combining statistics, text embeddings, and image features.

Load Required Libraries

library(tidyverse)
library(plotly)

Load Data

# Read the per-Pokemon cluster assignments
clustering_results <- read_csv(
  file = "q1_clustering/output/spectral_clustering_results.csv"
)

# Read the spectral embedding (the 20D space in which clustering was run)
spectral_embedding <- readRDS(
  file = "q1_clustering/output/spectral_embedding.rds"
)

# Report how many rows were read from the results table
nrow(clustering_results) %>%
  sprintf("Loaded %d Pokemon with cluster assignments\n", .) %>%
  cat()
## Loaded 948 Pokemon with cluster assignments
# Report the embedding's row and column counts
sprintf(
  "Loaded spectral embedding: %d Pokemon x %d dimensions\n",
  nrow(spectral_embedding),
  ncol(spectral_embedding)
) %>%
  cat()
## Loaded spectral embedding: 948 Pokemon x 20 dimensions

Extract Visualization Dimensions from Spectral Embedding

# Use the first three columns of the spectral embedding as the 3D plot
# coordinates. The embedding's row names are copied into `name`, which is the
# key used to join the cluster assignments later.
# NOTE(review): the report text assumes column 1 is the first non-trivial
# eigenvector of the graph Laplacian - confirm against the embedding step.

# Fail early with a clear message: without row names, mutate(name = NULL)
# would silently add no column and the later join on "name" would break.
stopifnot(
  "spectral_embedding needs at least 3 columns" =
    ncol(spectral_embedding) >= 3,
  "spectral_embedding needs row names (Pokemon names)" =
    !is.null(rownames(spectral_embedding))
)

# Name the columns on the matrix itself so as_tibble() never has to repair
# missing column names (which triggers a compatibility warning).
embedding_3d <- spectral_embedding[, 1:3, drop = FALSE]
colnames(embedding_3d) <- c("Dim1", "Dim2", "Dim3")

spectral_data_3d <- as_tibble(embedding_3d) %>%
  mutate(name = rownames(spectral_embedding))

cat("Using first 3 dimensions of spectral embedding for 3D visualization\n")
## Using first 3 dimensions of spectral embedding for 3D visualization
cat("These correspond to the smallest non-zero eigenvalues of the graph Laplacian\n")
## These correspond to the smallest non-zero eigenvalues of the graph Laplacian

Merge with Cluster Assignments

# Attach each Pokemon's cluster assignment to its 3D coordinates, matching
# rows on the shared `name` column
plot_data <- left_join(spectral_data_3d, clustering_results, by = "name")
# Store the cluster label as a factor
plot_data$cluster <- factor(plot_data$cluster)

# Report the size of the merged table
plot_data %>%
  nrow() %>%
  sprintf("Created plot data with %d Pokemon\n", .) %>%
  cat()
## Created plot data with 948 Pokemon
# Report how many distinct cluster labels are present
plot_data %>%
  pull(cluster) %>%
  n_distinct() %>%
  sprintf("Number of clusters: %d\n", .) %>%
  cat()
## Number of clusters: 18

Cluster Size Summary

# Tally Pokemon per cluster, largest cluster first
cluster_summary <- count(plot_data, cluster, sort = TRUE)

# Render the tally as a captioned table
cluster_summary %>%
  knitr::kable(
    col.names = c("Cluster", "Number of Pokemon"),
    caption = "Cluster Sizes"
  )
Cluster Sizes
Cluster Number of Pokemon
18 135
5 109
17 82
9 73
13 59
1 57
16 51
14 50
2 49
12 49
15 48
6 47
7 41
10 33
11 33
3 30
4 1
8 1

Interactive 3D Spectral Clustering Visualization

# Eighteen distinct hex colors, one for each of the 18 clusters
high_contrast_colors <- c(
  "#e6194b", "#3cb44b", "#ffe119", "#4363d8", "#f58231", "#911eb4",
  "#46f0f0", "#f032e6", "#bcf60c", "#fabebe", "#008080", "#e6beff",
  "#9a6324", "#fffac8", "#800000", "#aaffc3", "#808000", "#ffd8b1"
)

# Build the interactive 3D scatter plot: one marker per Pokemon, positioned by
# its three spectral coordinates and colored by cluster.
p <- plot_ly(
  data = plot_data,
  x = ~Dim1,
  y = ~Dim2,
  z = ~Dim3,
  color = ~cluster,
  colors = high_contrast_colors,
  text = ~name,
  # Carry the cluster label with every point so the hover box can print it.
  # The previous template used %{marker.color}, which shows the marker's
  # color value rather than the cluster label.
  customdata = ~as.character(cluster),
  type = "scatter3d",
  mode = "markers",
  marker = list(
    size = 4,
    sizemode = "diameter",
    opacity = 0.85,
    # Thin dark outline around each marker
    line = list(
      color = "rgba(0, 0, 0, 0.8)",
      width = 1
    )
  ),
  # paste0() so no stray space is inserted between template fragments.
  # The empty <extra> tag suppresses the secondary hover box.
  hovertemplate = paste0(
    "<b>%{text}</b><br>",
    "Cluster: %{customdata}<br>",
    "Dim 1: %{x:.3f}<br>",
    "Dim 2: %{y:.3f}<br>",
    "Dim 3: %{z:.3f}<br>",
    "<extra></extra>"
  )
) %>%
  layout(
    title = list(
      text = "Pokemon Spectral Clustering - 3D Visualization in Spectral Embedding Space (k=18)<br><sub>First 3 eigenvectors from normalized graph Laplacian</sub>",
      font = list(size = 16)
    ),
    scene = list(
      xaxis = list(
        title = "Spectral Dim 1 (2nd smallest eigenvalue)",
        gridcolor = "#E5E5E5",
        showbackground = TRUE,
        backgroundcolor = "#F8F8F8"
      ),
      yaxis = list(
        title = "Spectral Dim 2 (3rd smallest eigenvalue)",
        gridcolor = "#E5E5E5",
        showbackground = TRUE,
        backgroundcolor = "#F8F8F8"
      ),
      zaxis = list(
        title = "Spectral Dim 3 (4th smallest eigenvalue)",
        gridcolor = "#E5E5E5",
        showbackground = TRUE,
        backgroundcolor = "#F8F8F8"
      ),
      # Initial viewpoint
      camera = list(
        eye = list(x = 1.5, y = 1.5, z = 1.3)
      )
    ),
    paper_bgcolor = "white",
    # Vertical legend placed just right of the plot area
    legend = list(
      title = list(text = "Cluster"),
      orientation = "v",
      x = 1.02,
      y = 1
    )
  )

# Print the widget so it is rendered in the report
p

Cluster Interpretation

# Draw up to five random Pokemon per cluster; the fixed seed keeps the draw
# reproducible between renders
set.seed(42)
cluster_samples <- plot_data %>%
  group_by(cluster) %>%
  slice_sample(n = 5) %>%
  ungroup() %>%
  arrange(cluster, name) %>%
  select(all_of(c("cluster", "name", "Dim1", "Dim2", "Dim3")))

# Render the sample as a captioned table, coordinates rounded to 3 decimals
cluster_samples %>%
  knitr::kable(
    col.names = c("Cluster", "Pokemon", "Dim 1", "Dim 2", "Dim 3"),
    caption = "Sample Pokemon from Each Cluster (up to 5 per cluster)",
    digits = 3
  )
Sample Pokemon from Each Cluster (up to 5 per cluster)
Cluster Pokemon Dim 1 Dim 2 Dim 3
1 Aerodactyl -0.008 -0.003 -0.016
1 Crobat -0.026 -0.002 0.016
1 Natu -0.032 0.019 -0.012
1 Skarmory -0.045 0.033 0.014
1 Tranquill 0.005 0.003 -0.016
2 Cofagrigus 0.056 -0.048 -0.006
2 Duskull 0.081 -0.046 0.007
2 Golurk 0.019 0.011 0.025
2 Phantump 0.077 -0.036 -0.039
2 Spiritomb 0.120 -0.017 -0.015
3 Dracovish 0.008 -0.002 -0.004
3 Dracozolt 0.004 0.013 0.003
3 Palkia 0.038 0.016 -0.017
3 Reshiram 0.046 0.051 -0.064
3 Xerneas 0.007 0.019 -0.012
4 Eternatus -0.027 -0.128 0.084
5 Arrokuda -0.004 0.012 0.017
5 Feebas -0.006 -0.010 0.023
5 Golduck 0.004 -0.006 0.013
5 Inteleon -0.003 -0.005 0.001
5 Slowbro -0.066 0.025 -0.004
6 Carbink -0.001 -0.004 0.005
6 Grimmsnarl -0.016 0.088 -0.055
6 Primarina 0.007 0.045 -0.015
6 Spritzee -0.011 0.067 -0.023
6 Togetic -0.009 0.046 -0.013
7 Carvanha 0.037 0.025 -0.014
7 Poochyena 0.028 -0.001 -0.009
7 Scrafty 0.028 -0.024 0.046
7 Thievul 0.021 -0.008 0.011
7 Ting-Lu 0.053 0.039 0.013
8 Ditto -0.028 0.029 0.004
9 Cascoon 0.010 0.016 -0.003
9 Dottler -0.027 0.006 0.042
9 Illumise -0.010 -0.007 0.042
9 Rabsca -0.021 0.008 0.022
9 Surskit 0.022 0.005 0.014
10 Articuno -0.011 -0.078 -0.068
10 Cryogonal -0.048 -0.140 -0.017
10 Glaceon -0.027 -0.065 -0.044
10 Kyurem -0.022 -0.073 -0.097
10 Snover -0.040 -0.068 -0.073
11 Appletun -0.026 0.035 -0.033
11 Axew -0.052 0.014 -0.012
11 Bagon -0.032 0.018 -0.014
11 Gible -0.047 0.067 0.032
11 Noibat -0.035 0.036 -0.013
12 Conkeldurr -0.033 0.005 -0.014
12 Machoke -0.029 0.002 -0.013
12 Machop -0.028 0.004 -0.020
12 Mankey -0.005 -0.018 0.009
12 Passimian -0.016 -0.016 0.025
13 Armarouge -0.060 0.045 -0.023
13 Blaziken -0.009 0.027 -0.042
13 Carkol -0.001 0.011 -0.040
13 Slugma 0.034 0.030 -0.053
13 Torchic 0.016 0.031 -0.054
14 Armaldo 0.021 -0.008 -0.056
14 Garganacl -0.003 -0.011 -0.008
14 Mamoswine -0.047 -0.042 0.013
14 Rhyperior -0.006 0.041 0.024
14 Roggenrola -0.016 -0.009 -0.052
15 Beldum -0.083 0.000 0.000
15 Melmetal 0.008 0.047 -0.048
15 Metang -0.088 -0.002 0.011
15 Perrserker -0.147 0.009 0.045
15 Revavroom -0.069 -0.022 0.016
16 Chimecho -0.019 -0.012 0.027
16 Cosmoem 0.032 0.009 0.035
16 Espurr -0.044 -0.010 0.030
16 Jirachi -0.045 0.010 0.019
16 Wynaut -0.026 0.011 0.003
17 Arboliva 0.029 -0.023 -0.011
17 Chikorita -0.003 0.003 -0.026
17 Fomantis 0.013 0.009 -0.037
17 Jumpluff -0.001 0.014 0.016
17 Tangela 0.003 0.017 -0.020
18 Furfrou 0.028 -0.070 0.051
18 Luxray -0.003 0.014 0.031
18 Skuntank 0.019 -0.006 0.013
18 Spinda 0.005 -0.040 0.014
18 Stunfisk 0.025 0.039 0.055

Summary Statistics

# Per-cluster size, then the mean and standard deviation of each plotted
# dimension. Single-member clusters have no spread to estimate, so their
# SD columns are NA.
summary_stats <- plot_data %>%
  group_by(cluster) %>%
  summarise(
    n = n(),
    # Two across() calls keep all means ahead of all SDs, matching the
    # positional column labels passed to kable() below
    across(c(Dim1, Dim2, Dim3), mean, .names = "mean_{.col}"),
    across(c(Dim1, Dim2, Dim3), sd, .names = "sd_{.col}"),
    .groups = "drop"
  ) %>%
  arrange(cluster)

# Render the statistics as a captioned table rounded to 3 decimals
summary_stats %>%
  knitr::kable(
    col.names = c(
      "Cluster", "Size",
      "Mean D1", "Mean D2", "Mean D3",
      "SD D1", "SD D2", "SD D3"
    ),
    caption = "Cluster Statistics in 3D Spectral Embedding Space",
    digits = 3
  )
Cluster Statistics in 3D Spectral Embedding Space
Cluster Size Mean D1 Mean D2 Mean D3 SD D1 SD D2 SD D3
1 57 -0.008 0.012 -0.011 0.020 0.017 0.020
2 49 0.049 -0.023 -0.011 0.025 0.023 0.028
3 30 0.043 0.031 -0.020 0.045 0.041 0.050
4 1 -0.027 -0.128 0.084 NA NA NA
5 109 0.002 0.002 0.004 0.017 0.020 0.025
6 47 -0.007 0.042 -0.003 0.023 0.022 0.025
7 41 0.033 -0.004 0.004 0.022 0.030 0.027
8 1 -0.028 0.029 0.004 NA NA NA
9 73 0.009 0.006 0.016 0.027 0.025 0.037
10 33 -0.049 -0.084 -0.056 0.030 0.019 0.025
11 33 -0.031 0.020 0.000 0.020 0.019 0.030
12 49 -0.017 -0.010 0.000 0.032 0.021 0.023
13 59 0.005 0.024 -0.029 0.022 0.017 0.018
14 50 -0.002 0.018 0.017 0.018 0.034 0.044
15 48 -0.037 0.005 0.002 0.032 0.021 0.027
16 51 -0.033 0.001 0.017 0.022 0.015 0.025
17 82 0.003 0.000 -0.011 0.015 0.013 0.019
18 135 0.011 -0.021 0.020 0.017 0.024 0.020

Methodology

Data Sources

  • Pokemon Statistics: Base stats, types, abilities, etc.
  • Text Embeddings: SVD of Pokemon descriptions
  • Image Features: PCA of Pokemon sprite images

Clustering Pipeline

  1. Feature Integration: Combined 2,730 features from stats, text, and images
  2. Distance Metric: Mahalanobis distance with shrinkage covariance
  3. Affinity Matrix: RBF kernel with σ = 0.5 × median distance
  4. Spectral Embedding: 20 eigenvectors from normalized graph Laplacian
  5. K-means Clustering: k = 18 clusters, with k chosen using the Calinski-Harabasz index
  6. Visualization: First 3 dimensions of spectral embedding (shown above)

Visualization Details

The interactive 3D plot shows Pokemon in the spectral embedding space, the same space in which k-means clustering was performed. The three plotted dimensions are the eigenvectors of the normalized graph Laplacian associated with its smallest non-trivial eigenvalues:

  • Dim 1: eigenvector of the 2nd smallest eigenvalue (the 1st is trivial, ~0)
  • Dim 2: eigenvector of the 3rd smallest eigenvalue
  • Dim 3: eigenvector of the 4th smallest eigenvalue

Interactive features:

  • Rotate: Click and drag to rotate the view
  • Zoom: Scroll wheel or pinch to zoom in/out
  • Pan: Right-click and drag (or shift+drag)
  • Hover: Mouse over points to see Pokemon details
  • Filter: Click legend items to show/hide clusters
  • Reset: Double-click to reset view

Note: Marker sizes in 3D plotly plots are fixed in screen pixels and don’t scale with zoom. For best viewing:

  • Zoom in to see detail in dense cluster regions
  • Hide clusters via the legend to reduce visual clutter
  • Rotate to find angles that separate overlapping clusters

This 3D spectral visualization is superior to PCA because:

  • Spectral embedding preserves graph structure and cluster separation
  • K-means was performed in the 20D spectral embedding, so a 3D projection of that same space is more representative of the clustering
  • It captures manifold structure better than linear PCA
  • The third dimension makes cluster separation easier to see than a 2D view

Results

  • Total Pokemon: 948
  • Number of Clusters: 18
  • Between/Total Variance: 55.66% (k-means on 20D spectral embedding)

Report generated on 2025-11-06 08:57:22.234287